(This covers data from 2017-04-30 to 2017-05-07)
Jump down to the results section, or to a specific probe within the results section:
Each probe is broken down by OS (Darwin, Linux, Windows)
%%html
<button onClick=code_toggle()>Toggle Code</button>
<script>
// Notebook helper: toggle visibility of all code input cells.
var code_show=true; //true -> hide code at first
function code_toggle() {
  $('div.prompt').hide(); // always hide prompt
  if (code_show){
    $('div.input').hide();
  } else {
    $('div.input').show();
  }
  code_show = !code_show
}
// Hide the code by default once the notebook has finished loading.
$( document ).ready(code_toggle);
</script>
from __future__ import division
import ujson as json
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
#from colour import Color
import math
import plotly.plotly as py
import IPython
import pyspark.sql.functions as fun
import pyspark.sql.types as st
from pyspark.sql import Row
from collections import Counter, defaultdict
from moztelemetry.spark import get_pings, get_one_ping_per_client, get_pings_properties
from montecarlino import grouped_permutation_test
IPython.core.pylabtools.figsize(16, 7)
sns.set_style('whitegrid')
sc.setLogLevel('INFO')
from operator import add
pd.set_option("display.max_rows", None)
def get_active_addon_info(addons_str):
    """ Return a list of currently enabled add-ons in the form
    (GUID, name, isSystem, isWebExtension, version).

    addons_str: JSON string with an "activeAddons" mapping of
    GUID -> add-on metadata, as found in a ping's addons field.
    Returns [] when there are no active add-ons.
    """
    addons = json.loads(addons_str)
    addons = addons.get("activeAddons", {})
    if not addons:
        return []
    ## items() (rather than the Python-2-only iteritems()) keeps this
    ## working on both Python 2 and Python 3.
    return [(guid, meta.get("name"), meta.get("isSystem"),
             meta.get('isWebExtension'), meta.get('version'))
            for guid, meta in addons.items()]
def get_top_addons(df, cohort_filter, n_top=100):
    """ Print the number of distinct add-ons in the cohort selected by
    `cohort_filter`, and return the table of the `n_top` most-installed
    add-ons (a Pandas DataFrame produced by dataset_installed_addons).
    """
    cohort_num, cohort_table = dataset_installed_addons(
        df.filter(cohort_filter),
        n_top=n_top)
    print("There were {:,} distinct add-ons installed across the '{}' cohort."
          .format(cohort_num, cohort_filter))
    ## The no-op self-assignments of n_installs/pct_installed were removed.
    return cohort_table
def dataset_installed_addons(data, n_top=100):
    """ Extract add-on info from a subset of the main dataset, and generate a
    table of top add-ons with installation counts.

    Returns (n_addons, table): the number of distinct add-on GUIDs, and a
    Pandas DataFrame of the `n_top` most-installed add-ons.
    """
    data_addons = data.select("addons").rdd.map(lambda row: row["addons"])
    data_addons.cache()
    n_in_data = data_addons.count()
    ## Count each (guid, name, isSystem, isWebExtension, version) combination,
    ## then key by GUID with the remaining metadata + count as the value.
    ## Index-based lambdas replace Python-2-only tuple parameter unpacking;
    ## this also fixes the original's mismatched tuple arities (it built
    ## 4-tuples here but unpacked 5 elements downstream).
    addon_counts = data_addons.flatMap(get_active_addon_info)\
        .map(lambda a: (a, 1))\
        .reduceByKey(add)\
        .map(lambda kv: (kv[0][0], (kv[0][1], kv[0][2], kv[0][3], kv[0][4], kv[1])))
    ## Summarize each GUID using its most common metadata values
    ## (the count is the last element of the value tuple).
    top_vals = addon_counts.reduceByKey(lambda a, b: a if a[-1] > b[-1] else b)\
        .map(lambda kv: (kv[0], kv[1][:4]))
    ## Total installs per GUID, summed over all metadata variants.
    n_installs = addon_counts.mapValues(lambda v: v[-1])\
        .reduceByKey(add)
    addon_info = top_vals.join(n_installs)\
        .map(lambda kv: {
            "guid": kv[0],
            "name": kv[1][0][0],
            "is_system": kv[1][0][1],
            "is_webextension": kv[1][0][2],
            "version": kv[1][0][3],
            "n_installs": kv[1][1],
            "pct_installed": kv[1][1] / n_in_data * 100
        })\
        .sortBy(lambda info: info["n_installs"], ascending=False)
    addon_info_coll = addon_info.collect() if not n_top else addon_info.take(n_top)
    addon_info_table = pd.DataFrame(addon_info_coll)
    addon_info_table = addon_info_table[["guid", "name", "version", "is_system",
                                         "is_webextension", "n_installs", "pct_installed"]]
    ## Number rows from 1.
    addon_info_table.index += 1
    n_addons = addon_info.count()
    data_addons.unpersist()
    return (n_addons, addon_info_table)
def chi2_distance(xs, ys, eps = 1e-10, normalize = True):
    """ The comparison metric for histograms: 0.5 * sum of squared bin
    differences scaled by bin mass (chi-square distance). """
    hist_a = xs.sum(axis=0)
    hist_b = ys.sum(axis=0)
    if normalize:
        hist_a = hist_a / hist_a.sum()
        hist_b = hist_b / hist_b.sum()
    ## eps guards against division by zero in empty bins.
    terms = [((a - b) ** 2) / (a + b + eps) for a, b in zip(hist_a, hist_b)]
    return 0.5 * np.sum(terms)
def median_diff(xs, ys):
    """ Difference between the two groups' medians (xs minus ys). """
    med_x = np.median(xs)
    med_y = np.median(ys)
    return med_x - med_y
def median(lst, n):
    """ Median of the first n elements of the (sorted) sequence lst.

    Averages the two middle values when n is even.
    """
    mid = int(np.ceil(n / 2.0))
    if n % 2 == 0:
        return (lst[mid - 1] + lst[mid]) / 2.0
    return lst[mid - 1]
def make_group_histogram(group_data):
    """ Combine separate client histograms into a single group histogram,
    normalizing bin counts to relative frequencies.

    group_data: a pandas Series whose elements are per-client histograms
    (each itself a Series of bin label -> count).

    Returns (group_hist, deciles, N): the merged histogram as relative
    percentages per bin, the list of bin labels at each decile boundary of
    the raw combined counts, and N = the number of clients in the group
    (counted before all-zero histograms are dropped).

    The unused inner `median` helper from the original was removed, and
    Series.iteritems() was replaced with items() (iteritems was removed
    in pandas 2.0).
    """
    N = len(group_data)
    ## Drop clients whose histograms have 0 total counts.
    client_totals = group_data.map(lambda x: x.sum())
    group_data = group_data[client_totals > 0]
    ## Raw combined counts over all clients, used for the decile summary.
    raw_counts = group_data.sum()
    n = raw_counts.sum()
    ## Walk cumulative counts, recording the bin label each time a decile
    ## boundary (0%, 10%, ..., 90%) is crossed. The -0.001 offset makes the
    ## first bin with any counts register the 0% decile.
    acc = -0.001
    curr_dec = 0
    deciles = []
    dec = 0
    for i, j in raw_counts.items():
        acc += j
        while acc >= curr_dec:
            deciles.append(i)
            dec += .1
            curr_dec = n * dec
    ## Convert frequency counts to relative frequency for each client histogram.
    group_data = group_data.map(lambda x: x / x.sum())
    ## Merge the group's client histograms by adding up the frequencies over
    ## all clients in the group, separately for each bin.
    group_data = group_data.sum()
    ## Convert the merged bin frequencies to relative percentages.
    group_data = 100 * group_data / group_data.sum()
    return group_data, deciles, N
def compare_histogram(histogram, webext_multi_1, webext_multi_4, multi_1=None, multi_4=None,
                      include_diff=False, include_diff_in_diff=False, did_separate_plot=False):
    """ Compare the multi-1 and multi-4 cohort histograms, both with and
    without WebExtensions, and graph the results.

    histogram: display name of the probe being compared.
    webext_multi_1 / webext_multi_4: per-client histograms for the
        webextensions cohorts.
    multi_1 / multi_4: per-client histograms for the no-webextension
        cohorts. NOTE(review): although these default to None, they are
        passed straight into make_group_histogram, which calls len() on
        them -- callers appear to always supply them; confirm before
        relying on the defaults.

    Plots the two histograms overlaid on the same graph (one figure per
    with/without-webextensions comparison), and prints a p-value for
    testing whether they differ. NOTE(review): when include_diff is True,
    only empty axes are set up -- the plotting and p-value loop lives in
    the else branch, and the difference plots are commented out below.

    Returns a dict of decile summaries, sample sizes and p-values.
    """
    multi_1_total, dec1, n1 = make_group_histogram(webext_multi_1)
    multi_4_total, dec4, n4 = make_group_histogram(webext_multi_4)
    multi_1_total_std, dec_std1, n1_std = make_group_histogram(multi_1)
    multi_4_total_std, dec_std4, n4_std = make_group_histogram(multi_4)
    ## Decile/sample-size summaries; "_std" keys are the no-webextension cohorts.
    ret = {
        "dec_1": dec1,
        "dec_4": dec4,
        "n_1": n1,
        "n_4": n4,
        'dec_1_std': dec_std1,
        'dec_4_std': dec_std4,
        'n_1_std': n1_std,
        'n_4_std': n4_std
    }
    if include_diff:
        if include_diff_in_diff and did_separate_plot:
            fig, (ax, diff_ax, diff_diff_ax) = plt.subplots(3, sharex=True, figsize=(16,10),
                                                            gridspec_kw={"height_ratios": [2,2,1]})
        else:
            fig, (ax, diff_ax) = plt.subplots(2, sharex=True)
    else:
        ## One overlaid bar plot per comparison: with and without WebExtensions.
        for comparison in [[multi_1_total, multi_4_total, webext_multi_1, webext_multi_4, 'With WebExtensions'],
                           [multi_1_total_std, multi_4_total_std, multi_1, multi_4, "Without WebExtensions"]]:
            fig = plt.figure()
            ax = fig.add_subplot(1, 1, 1)
            fig.subplots_adjust(hspace=0.3)
            ## Twin axis so the two bar series can be offset side by side.
            ax2 = ax.twinx()
            width = 0.4
            ylim = max(multi_1_total.max(), multi_4_total.max())
            m1, m4, o1, o4, hist_desc = comparison
            m1.plot(kind="bar", alpha=0.5, color="green", label="Multi 1", ax=ax, width=width,
                    position=1, ylim=(0, ylim + 1))
            m4.plot(kind="bar", alpha=0.5, color="blue", label="Multi 4", ax=ax2, width=width,
                    position=0, grid=False, ylim=ax.get_ylim())
            ## Combine legend info from both Axes.
            ax_h, ax_l = ax.get_legend_handles_labels()
            ax2_h, ax2_l = ax2.get_legend_handles_labels()
            ax.legend(ax_h + ax2_h, ax_l + ax2_l, loc = 0)
            ax.xaxis.grid(False)
            ax.set_ylabel("Frequency %")
            # Only display at most 100 tick labels on the x axis.
            xticklabs = plt.gca().get_xticklabels()
            max_x_ticks = 100
            if len(xticklabs) > max_x_ticks:
                step_size = math.ceil(float(len(xticklabs)) / max_x_ticks)
                for i, tl in enumerate(xticklabs):
                    if i % step_size != 0:
                        tl.set_visible(False)
            ## Compute a p-value for the chi-square distance between the groups' combined histograms.
            pvalue = grouped_permutation_test(chi2_distance, [o1, o4], num_samples=100)
            ## First pass stores the webextensions p-value, second the standard one.
            if "pvalue_14" not in ret:
                ret["pvalue_14"] = pvalue
            else:
                ret["pvalue_14_std"] = pvalue
            print_with_markdown("""---\n#### {} {}\nProbability the two histograms differ by chance is
            <span style="color:{}">**{:.3f}**.</span>
            """.format(histogram, hist_desc ,"red" if pvalue <= .05 else "green", pvalue))
            plt.show()
    # if include_diff:
    #     ## Add a second barplot of the difference in frequency for each bucket.
    #     #diff_ax = fig.add_subplot(2, 1, 2)
    #     enDiff = multi_1_total - multi_4_total
    #     has_diff_in_diff_data = (multi_1 is not None and len(multi_1) > 0 and
    #                              multi_4 is not None and len(multi_4) > 0)
    #     if include_diff_in_diff and has_diff_in_diff_data:
    #         ## Add bin differences for between e10s/non-e10s for the no-addons cohorts.
    #         ## The assumption is that the difference between addons cohorts would look the same
    #         ## if there is no additional effect of having addons.
    #         multi_1_total_std, dec_std1, n1_std = make_group_histogram(multi_1)
    #         multi_4_total_std, dec_std4, n4_std = make_group_histogram(multi_4)
    #         enDiff_std = multi_1_total_std - multi_4_total_std
    #         ylims = (min(enDiff.min(), enDiff_std.min()) - 0.5, max(enDiff.max(), enDiff_std.max()) + 0.5)
    #         diff_ax2 = diff_ax.twinx()
    #         enDiff.plot(kind="bar", alpha=0.5, color="navy", label="WebExtensions", ax=diff_ax, width=width,
    #                     position=1, ylim=ylims)
    #         enDiff_std.plot(kind="bar", alpha=0.5, color="gray", label="no WebExtensions", ax=diff_ax2, width=width,
    #                         position=0, grid=False, ylim=diff_ax.get_ylim())
    #         ## Combine legend info from both Axes.
    #         diff_ax_h, diff_ax_l = diff_ax.get_legend_handles_labels()
    #         diff_ax2_h, diff_ax2_l = diff_ax2.get_legend_handles_labels()
    #         leg_h = diff_ax_h + diff_ax2_h
    #         leg_l = diff_ax_l + diff_ax2_l
    #         if did_separate_plot:
    #             enDiffDiff = enDiff - enDiff_std
    #             enDiffDiff.plot(kind="bar", alpha=0.5, color="maroon", ax=diff_diff_ax, ylim=diff_ax.get_ylim())
    #             diff_diff_ax.xaxis.grid(False)
    #             diff_diff_ax.set_ylabel("Diff in freq %")
    #             diff_diff_ax.set_title("Diff between multi 1/4 with webextensions and multi 1/4 diff without webextensions" +
    #                                    " (with webextensions higher when > 0)")
    #     else:
    #         if include_diff_in_diff:
    #             ## We wanted to do the additional comparison, but there wasn't enough data.
    #             print("\nNo diff-in-diff comparison: one of the standard cohorts has no non-missing observations.")
    #         enDiff.plot(kind="bar", alpha=0.5, color="navy", label="WebExtensions", ax=diff_ax)
    #         leg_h, leg_l = diff_ax.get_legend_handles_labels()
    #     plt.title("multi1/multi4 difference (more multi1 in bucket when > 0)")
    #     diff_ax.xaxis.grid(False)
    #     diff_ax.set_ylabel("Diff in frequency %")
    #     diff_ax.legend(leg_h, leg_l, loc = 0)
    #     if include_diff_in_diff:
    #         pvalue_std = grouped_permutation_test(chi2_distance, [multi_1, multi_4], num_samples=1000)
    #         print("The probability that the distributions for {} (without webextensions)\nare differing by chance is {:.3f}."\
    #               .format(histogram, pvalue_std))
    return ret
def normalize_uptime_hour(frame):
    """ Convert metrics to rates per hour of uptime.

    Rows with at most 60 seconds of uptime are dropped to avoid unstable
    rates. The OS name column, if present, is set aside and restored
    unchanged after the normalization; the uptime column itself is removed
    from the result.
    """
    frame = frame[frame["payload/simpleMeasurements/totalTime"] > 60]
    removed_os = False
    os = None
    try:
        os = frame["system/os/name"]
        frame = frame.drop("system/os/name", axis=1)
        removed_os = True
    except KeyError:
        ## No OS column in this frame; nothing to set aside.
        ## (The original used a bare except, which would also have hidden
        ## unrelated errors.)
        pass
    ## Metric per hour: scale each column by 3600 / uptime-in-seconds.
    frame = 60 * 60 * frame.apply(lambda x: x / frame["payload/simpleMeasurements/totalTime"])
    frame.drop('payload/simpleMeasurements/totalTime', axis=1, inplace=True)
    if removed_os:
        frame["system/os/name"] = os
    return frame
def compare_e10s_count_histograms(pings, cohort_sizes = {}, *histogram_names, **kwargs):
    """ Read multiple count histograms from a collection of pings, and compare
    the multi-bucket cohorts for each, separately per OS.

    Treats count histograms as scalars for comparison purposes, without
    distinguishing between parent and child processes. Expects a dict
    containing overall cohort sizes for computing sample size proportions
    (`cohort_sizes` is only read, never mutated, so the shared default is
    safe).

    Returns {os: {histogram name: compare_scalars result}}.
    """
    properties = histogram_names + ("payload/simpleMeasurements/totalTime", "cohort", "system/os/name")
    frame = pd.DataFrame(get_pings_properties(pings, properties).collect())
    ret = defaultdict(dict)
    for os in np.unique(frame['system/os/name']):
        print_with_markdown("---")
        print_with_markdown("# {}".format(os))
        frame_os = frame[frame['system/os/name'] == os]
        ## Cohorts with webextensions.
        we_multi1 = normalize_uptime_hour(
            frame_os[frame_os.cohort == "webextensions-multiBucket1"].drop("cohort", axis=1))
        we_multi4 = normalize_uptime_hour(
            frame_os[frame_os.cohort == "webextensions-multiBucket4"].drop("cohort", axis=1))
        include_diff_in_diff = kwargs.get("include_diff_in_diff", True)
        if include_diff_in_diff:
            multi1 = normalize_uptime_hour(frame_os[frame_os.cohort == "multiBucket1"].drop("cohort", axis=1))
            multi4 = normalize_uptime_hour(frame_os[frame_os.cohort == "multiBucket4"].drop("cohort", axis=1))
        for histogram in histogram_names:
            ## Check against the webextensions frame: unlike multi1, it is
            ## always defined (the original checked multi1.columns, a
            ## NameError when include_diff_in_diff is False).
            if histogram not in we_multi1.columns:
                continue
            ## Remove the property path from the histogram name for display purposes.
            hist_name = hist_base_name(histogram)
            if type(hist_name) == list:
                ## Key was given for keyed histogram.
                hist_str = "{}/{}".format(link_to_histogram(hist_name[0]), hist_name[1])
                hist_name = hist_name[0]
            else:
                hist_str = hist_name
            ## Print a header for the block of graphs, including a link to the histogram definition.
            print_with_markdown("Comparison for count histogram {}:".format(hist_str))
            we_multi1_hist = we_multi1[[histogram, 'system/os/name']].dropna()
            we_multi4_hist = we_multi4[[histogram, 'system/os/name']].dropna()
            ## Print some information on sample sizes.
            print("{} Multi 4 profiles (with webextensions) have this histogram.".format(
                sample_size_str(len(we_multi4_hist), cohort_sizes.get(WEBEXTENSION_MULTI_4))))
            print("{} Multi 1 profiles (with webextensions) have this histogram.".format(
                sample_size_str(len(we_multi1_hist), cohort_sizes.get(WEBEXTENSION_MULTI_1))))
            ## If either group has no data, nothing more to do.
            if len(we_multi4_hist) == 0 or len(we_multi1_hist) == 0:
                continue
            print("")
            hist_name = histogram.split('/')[-1]
            ret[os][hist_name] = compare_scalars(hist_name + " per hour", we_multi1_hist, we_multi4_hist,
                                                 multi1[[histogram, 'system/os/name']].dropna() if include_diff_in_diff else None,
                                                 multi4[[histogram, 'system/os/name']].dropna() if include_diff_in_diff else None)
    return ret
def compare_e10s_histograms(pings, cohort_sizes = {}, *histogram_names, **kwargs):
    """ Read multiple histograms from a collection of pings, and compare multi1 /
    multi4 with webextension / without webextension for each.

    Outputs separate comparisons for parent process, child processes, and
    merged histograms. Expects a dict containing overall cohort sizes for
    computing sample size proportions (`cohort_sizes` is only read, never
    mutated, so the shared default is safe).

    Returns {os: {histogram name + type suffix: compare_histogram result}}.
    """
    ## True when f() holds over the sample sizes of all four cohorts for the
    ## given histogram-type suffix. The original listed "multi_4" twice and
    ## never checked "multi_1"; that is fixed here.
    has_hist = lambda f, samp, suff: f([bool(samp[i + suff])
                                       for i in ("multi_1", "multi_4", "webext_multi_1", "webext_multi_4")])
    ## Load histogram data from the ping set, separating parent & child processes.
    frame = pd.DataFrame(get_pings_properties(pings, histogram_names + ("cohort", "has_webextension", "system/os/name"),
                                              with_processes=True)\
                         .collect())
    ret = defaultdict(dict)
    for os in np.unique(frame['system/os/name']):
        print_with_markdown("# {}".format(os))
        frame_os = frame[frame['system/os/name'] == os]
        ## Cohorts without webextensions.
        multi1 = frame_os[frame_os.cohort == "multiBucket1"]
        multi4 = frame_os[frame_os.cohort == "multiBucket4"]
        ## Cohorts with webextensions.
        we_multi1 = frame_os[frame_os.cohort == "webextensions-multiBucket1"]
        we_multi4 = frame_os[frame_os.cohort == "webextensions-multiBucket4"]
        for histogram in histogram_names:
            if histogram not in we_multi1.columns:
                continue
            ## Remove the property path from the histogram name for display purposes.
            hist_name = hist_base_name(histogram)
            if type(hist_name) == list:
                ## Key was given for keyed histogram.
                hist_str = "{}/{}".format(link_to_histogram(hist_name[0]), hist_name[1])
                hist_name = hist_name[0]
            else:
                hist_str = hist_name
            ## Print a header for the block of graphs, including a link to the histogram definition.
            print_with_markdown("Comparison for {}:".format(hist_str))
            ## Collect merged, parent and child histograms for all four cohorts.
            addons_hist_data = {
                "multi_1_merged": multi1[histogram],
                "multi_4_merged": multi4[histogram],
                "webext_multi_1_merged": we_multi1[histogram],
                "webext_multi_4_merged": we_multi4[histogram],
                "multi_1_parent": multi1[histogram + "_parent"],
                "multi_4_parent": multi4[histogram + "_parent"],
                "webext_multi_1_parent": we_multi1[histogram + "_parent"],
                "webext_multi_4_parent": we_multi4[histogram + "_parent"],
                "multi_1_child": multi1[histogram + "_children"],
                "multi_4_child": multi4[histogram + "_children"],
                "webext_multi_1_child": we_multi1[histogram + "_children"],
                "webext_multi_4_child": we_multi4[histogram + "_children"],
            }
            for htype in addons_hist_data:
                addons_hist_data[htype] = addons_hist_data[htype].dropna()
            ## Print some information on sample sizes.
            ## (items() instead of Python-2-only iteritems().)
            sample_sizes = { htype: len(hdata) for htype, hdata in addons_hist_data.items() }
            print("{} multi_1 profiles have this histogram.".format(
                sample_size_str(sample_sizes["multi_1_merged"], cohort_sizes.get(MULTI_1))))
            print("{} multi_4 profiles have this histogram.".format(
                sample_size_str(sample_sizes["multi_4_merged"], cohort_sizes.get(MULTI_4))))
            print("{} webext_multi_1 profiles have this histogram.".format(
                sample_size_str(sample_sizes["webext_multi_1_merged"], cohort_sizes.get(WEBEXTENSION_MULTI_1))))
            print("{} webext_multi_4 profiles have this histogram.".format(
                sample_size_str(sample_sizes["webext_multi_4_merged"], cohort_sizes.get(WEBEXTENSION_MULTI_4))))
            ## If no group has any merged data, nothing more to do.
            if not has_hist(any, sample_sizes, "_merged"):
                print("No hists found")
                continue
            print("{} multi_1 profiles have the parent histogram.".format(
                sample_size_str(sample_sizes["multi_1_parent"], cohort_sizes.get(MULTI_1))))
            print("{} multi_4 profiles have the parent histogram.".format(
                sample_size_str(sample_sizes["multi_4_parent"], cohort_sizes.get(MULTI_4))))
            print("{} webext_multi_1 profiles have the parent histogram.".format(
                sample_size_str(sample_sizes["webext_multi_1_parent"], cohort_sizes.get(WEBEXTENSION_MULTI_1))))
            print("{} webext_multi_4 profiles have the parent histogram.".format(
                sample_size_str(sample_sizes["webext_multi_4_parent"], cohort_sizes.get(WEBEXTENSION_MULTI_4))))
            print("{} multi_1 profiles have the child histogram.".format(
                sample_size_str(sample_sizes["multi_1_child"], cohort_sizes.get(MULTI_1))))
            print("{} multi_4 profiles have the child histogram.".format(
                sample_size_str(sample_sizes["multi_4_child"], cohort_sizes.get(MULTI_4))))
            print("{} webext_multi_1 profiles have the child histogram.".format(
                sample_size_str(sample_sizes["webext_multi_1_child"], cohort_sizes.get(WEBEXTENSION_MULTI_1))))
            print("{} webext_multi_4 profiles have the child histogram.".format(
                sample_size_str(sample_sizes["webext_multi_4_child"], cohort_sizes.get(WEBEXTENSION_MULTI_4))))
            has_parents = has_hist(all, sample_sizes, "_parent")
            has_children = has_hist(all, sample_sizes, "_child")
            ## Each comparison type always gets a key in ret; the value is
            ## None when the corresponding data is missing. `result` is reset
            ## before each comparison so a stale result is never reused.
            result = None
            if has_children and has_parents:
                result = compare_histogram(hist_name + " (merged)",
                                           addons_hist_data["webext_multi_1_merged"],
                                           addons_hist_data["webext_multi_4_merged"],
                                           addons_hist_data["multi_1_merged"],
                                           addons_hist_data["multi_4_merged"],
                                           **kwargs)
            ret[os][hist_name + " (merged)"] = result
            result = None
            if has_parents:
                result = compare_histogram(hist_name + " (parent)",
                                           addons_hist_data["webext_multi_1_parent"],
                                           addons_hist_data["webext_multi_4_parent"],
                                           addons_hist_data["multi_1_parent"],
                                           addons_hist_data["multi_4_parent"],
                                           **kwargs)
            ret[os][hist_name + " (parent)"] = result
            result = None
            if has_children:
                result = compare_histogram(hist_name + " (children)",
                                           addons_hist_data["webext_multi_1_child"],
                                           addons_hist_data["webext_multi_4_child"],
                                           addons_hist_data["multi_1_child"],
                                           addons_hist_data["multi_4_child"],
                                           **kwargs)
            ret[os][hist_name + " (child)"] = result
    return ret
def compare_scalars(metric, we_multi1, we_multi4, multi1=None, multi4=None, unit="units"):
    """ Prints info about the median difference between the groups, together with
    a p-value for testing the difference.

    metric: display name of the metric being compared.
    we_multi1 / we_multi4: DataFrames whose first column holds the metric for
        the webextensions cohorts.
    multi1 / multi4: optional DataFrames for the no-webextension cohorts; when
        supplied, a second comparison is printed and "_std" keys are added to
        the returned dict.
    unit: optional string indicating the units the metric is measured in.

    Returns a dict of decile arrays, sample sizes and p-values, or None when
    one webextensions group has no non-missing observations.
    """
    def fix_nested_series(series):
        ## Collapse a Series-of-Series (per-client vectors) into a single
        ## element-wise sum vector.
        x = np.zeros(len(series.iloc[0]))
        for i in series:
            x += i
        return x
    ## NOTE(review): this initial value is dead -- `ret` is unconditionally
    ## rebound to a plain dict below.
    ret = defaultdict(dict)
    ## The metric is assumed to be the first column of the supplied frames.
    col = we_multi1.columns[0]
    we_multi1 = we_multi1.dropna()[col]
    we_multi4 = we_multi4.dropna()[col]
    if len(we_multi1) == 0 or len(we_multi4) == 0:
        print("Cannot run comparison: one of the groups has no non-missing observations.")
        return
    print("Comparison for {}{} (with webextensions):\n".format(metric, " ({})".format(unit) if unit != "units" else ""))
    if type(we_multi1.iloc[0]) == pd.Series:
        we_multi1 = fix_nested_series(we_multi1)
    if type(we_multi4.iloc[0]) == pd.Series:
        we_multi4 = fix_nested_series(we_multi4)
    ## Sample sizes count only clients with a non-zero value.
    we1_n = len(we_multi1[we_multi1 > 0])
    we4_n = len(we_multi4[we_multi4 > 0])
    ## Deciles (0th..90th percentile); index 5 is the median.
    we1_dec = np.percentile(we_multi1, np.arange(0, 100, 10))
    we4_dec = np.percentile(we_multi4, np.arange(0, 100, 10))
    mdiff = median_diff(we_multi1, we_multi4)
    print("- Median with 1 content process is {:.3g} {} {} median with 4 processes."\
          .format(
              #abs(mdiff),
              mdiff,
              unit,
              #"higher than" if mdiff >= 0 else "lower than"
              "different from"))
    print("- This is a relative difference of {:.1f}%.".format(float(mdiff) / we4_dec[5] * 100))
    print("- Multi 1 group median is {:.4g}, Multi 4 group median is {:.4g}.".format(we1_dec[5], we4_dec[5]))
    pvalue = grouped_permutation_test(median_diff, [we_multi1, we_multi4], num_samples=10000)
    print("\n(with webextensions) The probability of this difference occurring purely by chance is {:.3f}."\
          .format(pvalue))
    ## NOTE(review): `mname` is computed but never used.
    mname = col.split("/")[-1]
    ret = {
        'dec_1': we1_dec,
        'dec_4': we4_dec,
        'n_1': we1_n,
        'n_4': we4_n,
        'pvalue_14': pvalue
    }
    if multi1 is not None and multi4 is not None:
        ## Include a comparison between non-addon cohorts.
        multi1_s = multi1.dropna()[col]
        multi4_s = multi4.dropna()[col]
        if len(multi1_s) > 0 and len(multi4_s) > 0:
            if type(multi1_s.iloc[0]) == pd.Series:
                multi1_s = fix_nested_series(multi1_s)
            if type(multi4_s.iloc[0]) == pd.Series:
                multi4_s = fix_nested_series(multi4_s)
            m1_n = len(multi1_s[multi1_s > 0])
            m4_n= len(multi4_s[multi4_s > 0])
            m1_dec = np.percentile(multi1_s, np.arange(0, 100, 10))
            m4_dec = np.percentile(multi4_s, np.arange(0, 100, 10))
            mdiff_std = median_diff(multi1_s, multi4_s)
            ## NOTE(review): the relative difference is printed twice below
            ## (inline here and again on the next print).
            print("\nFor cohorts with no webextensions, median with 1 content process is {:.3g} {} ({:.1f}%) {} median with 4 processes"\
                  .format(
                      #abs(mdiff_std),
                      mdiff_std,
                      unit,
                      float(mdiff_std) / m4_dec[5] * 100,
                      #"higher than" if mdiff_std >= 0 else "lower than"
                      "different from"))
            print("- This is a relative difference of {:.1f}%."\
                  .format(float(mdiff_std) / m4_dec[5] * 100))
            print("- Multi 1 group median is {:.4g}, Multi 4 group median is {:.4g}."\
                  .format(m1_dec[5], m4_dec[5]))
            pvalue_std = grouped_permutation_test(median_diff, [multi1_s, multi4_s], num_samples=10000)
            print("\n(without webextensions) The probability of this difference occurring purely by chance is {:.3f}."\
                  .format(pvalue_std))
            ret["dec_1_std"] = m1_dec
            ret["dec_4_std"] = m4_dec
            ret["n_1_std"] = m1_n
            ret["n_4_std"] = m4_n
            ret["pvalue_14_std"] = pvalue_std
    return ret
def link_to_histogram(hist_name):
    """ Create a Markdown link from a histogram name to its definition in DXR. """
    url = ("https://dxr.mozilla.org/mozilla-central/search"
           "?q={}+file%3AHistograms.json&redirect=true").format(hist_name)
    return "[{}]({})".format(hist_name, url)
def hist_base_name(path_to_histogram):
    """ Remove any path components from histogram name.

    If the histogram is specified as a payload path with separator '/',
    return only the last component (the actual name). However, if the
    histogram is keyed and specified with a key
    (".../keyedHistograms/<name>/<key>"), return [name, key].
    """
    parts = path_to_histogram.rsplit("/")
    ## Require at least 3 components before peeking at parts[-3]; the
    ## original only checked len > 1 and raised IndexError on 2-part paths.
    if len(parts) > 2 and parts[-3] == "keyedHistograms":
        ## There was a keyedHistogram name and key given.
        return parts[-2:]
    return parts[-1]
## Hack to render links in code output.
from IPython.display import Markdown, display
def print_with_markdown(md_text):
    """ Print Markdown text so that it renders correctly in the cell output. """
    ## display() + Markdown() renders formatting (links, headers) in the
    ## notebook where a plain print() would show raw markup.
    display(Markdown(md_text))
def sample_size_str(sample_size, cohort_size=None):
    """ Convert a sample size to a string representation, including a
    percentage of the cohort when the cohort size is available. """
    if not sample_size:
        return "No"
    if not cohort_size:
        return str(sample_size)
    if sample_size == cohort_size:
        return "All"
    pct = float(sample_size) / cohort_size * 100
    return "{} ({:.1f}%)".format(sample_size, pct)
def get_cohort_dist(dataset):
    """ Summarize the per-cohort client counts in a Spark DataFrame.

    Prints the total client count, and returns a Pandas DataFrame with one
    row per e10sCohort: (cohort, count, pct of total), sorted by cohort name.
    """
    cohort_counts = dataset.groupby("e10sCohort").count().collect()
    dataset_count = sum(map(lambda r: r["count"], cohort_counts))
    def cohort_proportions(r):
        ## r is a row with "e10sCohort" and "count" fields.
        prop = r["count"] * 100.0 / dataset_count
        return (r["e10sCohort"], r["count"], "{:.2f}%".format(prop))
    print("\nTotal number of clients: {:,}".format(dataset_count))
    ## Sort rows by cohort name for stable display.
    d = pd.DataFrame(sorted(map(cohort_proportions, cohort_counts), key = lambda r: r[0]))
    d.columns = ['cohort', 'count', 'pct']
    return d
The derived dataset is computed from profiles on Beta 54 who belong to the {webextensions-}multiBucket{1, 4} e10s Cohorts. It contains a single record (ping) per client, which is randomly selected from among the client's pings during the date range.
# Regenerated data, loaded into telemetry-test-bucket.
dataset = sqlContext.read.parquet(
    "s3://telemetry-test-bucket/e10s_experiment_view/multi_webExtensions_beta54_cohorts/v20170430_20170507/")
## Show the schema of the derived dataset.
dataset.printSchema()
How many records are in the overall dataset?
## Total number of records in the overall dataset.
dataset.count()
What are the cohorts, and how many clients do we have in each cohort?
## Per-cohort client counts and proportions.
get_cohort_dist(dataset)
## Cohort labels used throughout the analysis.
WEBEXTENSION_MULTI_1 = u'webextensions-multiBucket1'
WEBEXTENSION_MULTI_4 = u'webextensions-multiBucket4'
MULTI_1 = u'multiBucket1'
MULTI_4 = u'multiBucket4'
Restrict to pings belonging to the Multi-WebExtensions experiment.
## Keep only the four cohorts that belong to the Multi-WebExtensions experiment.
webext_exp_dataset = dataset.filter(\
    "e10sCohort in ('%s','%s', '%s', '%s')" % (WEBEXTENSION_MULTI_1,
                                               WEBEXTENSION_MULTI_4,
                                               MULTI_1,
                                               MULTI_4))
How many clients are left?
## Number of clients remaining after restricting to the experiment cohorts.
webext_exp_dataset.count()
We want to make sure that the pings tagged into the cohorts satisfy the basic assumptions of the experiment, as this is not guaranteed. Both webextensions-multiBucket{1, 4} cohorts should have only WebExtension add-ons, and both multiBucket{1, 4} cohorts should have no add-ons. All profiles should have e10s enabled.
def e10s_status_check(settings, addons):
    """ Check whether e10s is enabled, and whether there are add-ons. """
    enabled = bool(json.loads(settings).get("e10sEnabled"))
    active = json.loads(addons).get("activeAddons")
    ## Collect the isWebExtension flag for every non-system add-on.
    statuses = []
    if active:
        for guid in active:
            meta = active[guid]
            if not meta.get("isSystem"):
                statuses.append(meta.get('isWebExtension'))
    ## An empty/missing add-on set does NOT count as "only webextensions":
    ## the webextensions cohorts are expected to actually have add-ons.
    only_webext = all(statuses) if statuses else False
    return Row(
        e10s_enabled=enabled,
        only_webextension=only_webext
    )
def bad_ping(cohort, settings, addons):
    """ e10s should be enabled iff the profile is in the test cohort, and
    profiles should have active add-ons if they are in the addons cohorts.
    """
    status = e10s_status_check(settings, addons)
    ## Every cohort in this experiment requires e10s to be enabled.
    if not status.e10s_enabled:
        return True
    ## The webextensions cohorts additionally require all non-system add-ons
    ## to be WebExtensions.
    if cohort.startswith("webextensions"):
        return not status.only_webextension
    return False
## Add a Column to the DF with the outcome of the check.
## This will be used to remove any bad rows after examining them.
from pyspark.sql.types import BooleanType
status_check_udf = fun.udf(bad_ping, BooleanType())
webext_exp_dataset_check = webext_exp_dataset.withColumn("badPing",
                                                         status_check_udf(webext_exp_dataset.e10sCohort,
                                                                          webext_exp_dataset.settings,
                                                                          webext_exp_dataset.addons))
If there are any bad pings, describe the problems and remove them from the dataset.
## Inspect the pings that failed the check: summarize the problems, then
## remove them from the dataset.
webext_exp_dataset_bad = webext_exp_dataset_check.filter("badPing")\
    .select("e10sCohort", "settings", "addons")\
    .rdd
has_bad = not webext_exp_dataset_bad.isEmpty()
if not has_bad:
    print("No issues")
else:
    ## Count the distinct (cohort, check outcome) combinations among bad pings.
    check_counts = webext_exp_dataset_bad\
        .map(lambda r: (r.e10sCohort, e10s_status_check(r.settings, r.addons)))\
        .countByValue()
    print("Issues:")
    ## items() instead of the Python-2-only iteritems().
    for k, v in check_counts.items():
        print("{}: {}".format(k, v))
if has_bad:
    print("\nRemoving these pings from the dataset.")
    webext_exp_dataset = webext_exp_dataset_check.filter("not badPing").drop("badPing")
    print("The dataset now contains {} clients".format(webext_exp_dataset.count()))
What add-ons are present for the addons cohorts?
def get_active_addon_info(addons_str):
    """ Return a list of currently enabled add-ons in the form
    (GUID, name, isSystem, isWebExtension, version).

    addons_str: JSON string with an "activeAddons" mapping of
    GUID -> add-on metadata. Returns [] when there are no active add-ons.
    """
    addons = json.loads(addons_str)
    addons = addons.get("activeAddons", {})
    if not addons:
        return []
    ## items() (rather than the Python-2-only iteritems()) keeps this
    ## working on both Python 2 and Python 3.
    return [(guid, meta.get("name"), meta.get("isSystem"), meta.get("isWebExtension"),
             meta.get('version')) for guid, meta in addons.items()]
def get_top_addons(df, cohort_filter, n_top=100):
    """ Print the number of distinct add-ons in the cohort selected by
    `cohort_filter`, and return the table of the `n_top` most-installed
    add-ons (a Pandas DataFrame produced by dataset_installed_addons).
    """
    cohort_num, cohort_table = dataset_installed_addons(
        df.filter(cohort_filter),
        n_top=n_top)
    print("There were {:,} distinct add-ons installed across the '{}' cohort."
          .format(cohort_num, cohort_filter))
    ## The no-op self-assignments of n_installs/pct_installed were removed.
    return cohort_table
def dataset_installed_addons(data, n_top=100):
    """ Extract add-on info from a subset of the main dataset, and generate a
    table of top add-ons with installation counts.

    Returns (n_addons, table): the number of distinct add-on GUIDs, and a
    Pandas DataFrame of the `n_top` most-installed add-ons.
    """
    data_addons = data.select("addons").rdd.map(lambda row: row["addons"])
    data_addons.cache()
    n_in_data = data_addons.count()
    ## Count each (guid, name, isSystem, isWebExtension, version) combination,
    ## then key by GUID with the remaining metadata + count as the value.
    ## Index-based lambdas replace Python-2-only tuple parameter unpacking
    ## (removed in Python 3 by PEP 3113).
    addon_counts = data_addons.flatMap(get_active_addon_info)\
        .map(lambda a: (a, 1))\
        .reduceByKey(add)\
        .map(lambda kv: (kv[0][0], (kv[0][1], kv[0][2], kv[0][3], kv[0][4], kv[1])))
    ## Summarize each GUID using its most common metadata values
    ## (the count is the last element of the value tuple).
    top_vals = addon_counts.reduceByKey(lambda a, b: a if a[-1] > b[-1] else b)\
        .map(lambda kv: (kv[0], kv[1][:4]))
    ## Total installs per GUID, summed over all metadata variants.
    n_installs = addon_counts.mapValues(lambda v: v[-1])\
        .reduceByKey(add)
    addon_info = top_vals.join(n_installs)\
        .map(lambda kv: {
            "guid": kv[0],
            "name": kv[1][0][0],
            "is_system": kv[1][0][1],
            "is_webextension": kv[1][0][2],
            "version": kv[1][0][3],
            "n_installs": kv[1][1],
            "pct_installed": kv[1][1] / n_in_data * 100
        })\
        .sortBy(lambda info: info["n_installs"], ascending=False)
    addon_info_coll = addon_info.collect() if not n_top else addon_info.take(n_top)
    addon_info_table = pd.DataFrame(addon_info_coll)
    addon_info_table = addon_info_table[["guid", "name", "version", "is_system",
                                         "is_webextension", "n_installs", "pct_installed"]]
    ## Number rows from 1.
    addon_info_table.index += 1
    n_addons = addon_info.count()
    data_addons.unpersist()
    return (n_addons, addon_info_table)
## Top 10 add-ons across the webextensions cohorts.
get_top_addons(webext_exp_dataset, "e10sCohort like 'webextensions%'", n_top=10)
What add-ons are present in the standard (non-addons) cohorts, if any?
## Top 10 add-ons across the standard (no-addons) cohorts, if any.
get_top_addons(webext_exp_dataset, "e10sCohort in ('multiBucket1', 'multiBucket4')", n_top=10)
def row_2_ping(row):
    """ Reassemble a dataset row into a ping-like nested dict. """
    def parse(field):
        ## Decode a JSON string column, treating missing/empty as {}.
        return json.loads(field) if field else {}
    ## Work around to get "processes" (child payloads) into the payload.
    submission = parse(row.submission)
    processes = submission.get("payload", {}).get("processes", {})
    cohort = row.e10sCohort
    return {
        "payload": {
            "simpleMeasurements": parse(row.simpleMeasurements),
            "histograms": parse(row.histograms),
            "keyedHistograms": parse(row.keyedHistograms),
            "childPayloads": parse(row.childPayloads),
            "threadHangStats": parse(row.threadHangStats),
            "processes": processes
        },
        "e10s": cohort.endswith("test"),
        "has_webextension": cohort.startswith("webextensions"),
        "system": json.loads(row.system),
        "cohort": cohort
    }
def notxp(p):
    """ True unless the ping comes from Windows XP (Windows_NT 5.1).

    Uses .get() throughout so pings with missing system/os info are kept
    rather than raising KeyError (the original indexed os["name"] directly
    after defaulting the os dict to {}).
    """
    os_info = p.get("system", {}).get("os", {})
    return os_info.get("name") != "Windows_NT" or os_info.get("version") != "5.1"
## Convert rows to ping-like dicts, excluding Windows XP profiles.
subset = webext_exp_dataset.rdd.map(row_2_ping).filter(notxp)
def add_gecko_activity(ping):
    """ Annotate a ping with hang rates derived from its threadHangStats.

    Adds "parent_hangs_per_minute" (Gecko thread) and, when child payloads
    with usable data are present, "child_hangs_per_minute" (mean over child
    Gecko_Child threads). Pings without positive uptime are returned
    unchanged. NOTE(review): a TypeError while reading a child payload's
    totalTime returns the ping immediately, discarding any child rates
    collected so far -- presumably intentional best-effort handling.
    """
    try:
        uptime = ping["payload"].get("simpleMeasurements", {}).get("totalTime", -1) / 60
    except TypeError:
        ## totalTime was not numeric; treat as no usable uptime.
        uptime = 0
    if uptime <= 0:
        return ping
    def get_hangs_per_minute(threads, thread_name, uptime):
        ## Rate of hangs >= ~128ms for the named thread, per minute of uptime.
        for thread in threads:
            if thread["name"] == thread_name:
                activity = thread["activity"]["values"]
                if activity:
                    histogram = pd.Series(activity.values(), index=map(int, activity.keys())).sort_index()
                    # 255 is upper bound for 128-255ms bucket.
                    return histogram[histogram.index >= 255].sum() / uptime
        return None
    threads = ping["payload"].get("threadHangStats", {})
    ping["parent_hangs_per_minute"] = get_hangs_per_minute(threads, "Gecko", uptime)
    child_payloads = ping["payload"].get("childPayloads", [])
    child_hangs_per_minute = []
    for payload in child_payloads:
        try:
            child_uptime = payload.get("simpleMeasurements", {}).get("totalTime", -1) / 60
        except TypeError:
            return ping
        if child_uptime <= 0:
            continue
        child_threads = payload.get("threadHangStats", {})
        child_hangs = get_hangs_per_minute(child_threads, "Gecko_Child", child_uptime)
        if child_hangs:
            child_hangs_per_minute.append(child_hangs)
    ## Average the per-child rates into a single value.
    if len(child_hangs_per_minute) > 0:
        ping["child_hangs_per_minute"] = sum(child_hangs_per_minute) / len(child_hangs_per_minute)
    return ping
## Annotate every ping with parent/child hang-rate measurements.
subset = subset.map(add_gecko_activity)
At this point, how many clients are left in each cohort? Key first by cohort.
## Drop any null entries and key each ping by its cohort label.
subset = subset.filter(lambda x: x is not None).map(lambda r: (r["cohort"], r))
## Count pings per cohort (triggers a Spark action).
cohort_sizes = subset.countByKey()
cohort_sizes
Sample from the majority groups so all cohorts end up roughly the same size: $n_1 \approx n_2 \approx \dots \approx n_i$
## Sampling fractions that shrink the larger standard (non-webextensions)
## cohorts down to the size of their webextensions counterparts.
multi_prop1 = cohort_sizes[WEBEXTENSION_MULTI_1] / cohort_sizes[MULTI_1]
multi_prop4 = cohort_sizes[WEBEXTENSION_MULTI_4] / cohort_sizes[MULTI_4]
sampling_props = {
    MULTI_1: multi_prop1,
    MULTI_4: multi_prop4,
    WEBEXTENSION_MULTI_1: 1,
    WEBEXTENSION_MULTI_4: 1
}
## Sample without replacement, per cohort key.
subset = subset.sampleByKey(False, sampling_props).cache()
## print() function for consistency with the rest of the notebook
## (this line previously used the Python 2 print statement).
print('Sampling the following proportions from each group:')
sampling_props
Now compute the final cohort sizes, and wrap them into the histogram comparison functions.
## Final cohort sizes after sampling.
e10s_addon_cohort_sizes = subset.countByKey()
## Remove the cohort label key from the dataset.
subset = subset.map(lambda r: r[1])
print("Final cohort sizes:")
print(" - multi-1 (no webextensions): {}".format(e10s_addon_cohort_sizes[MULTI_1]))
print(" - multi-4 (no webextensions): {}".format(e10s_addon_cohort_sizes[MULTI_4]))
print(" - multi-1 (webextensions): {}".format(e10s_addon_cohort_sizes[WEBEXTENSION_MULTI_1]))
print(" - multi-4 (webextensions): {}".format(e10s_addon_cohort_sizes[WEBEXTENSION_MULTI_4]))
def compare_histograms(pings, *histogram_names, **kwargs):
    """Compare the named histograms across cohorts, binding in the final cohort sizes."""
    return compare_e10s_histograms(pings, e10s_addon_cohort_sizes, *histogram_names, **kwargs)
def compare_count_histograms(pings, *histogram_names, **kwargs):
    """Compare the named count histograms across cohorts, binding in the final cohort sizes."""
    return compare_e10s_count_histograms(pings, e10s_addon_cohort_sizes, *histogram_names, **kwargs)
def fix_hist(ping):
    """Alias the e10s tab-switch histogram under the plain probe name.

    e10s profiles report FX_TAB_SWITCH_TOTAL_E10S_MS; copying it to
    FX_TAB_SWITCH_TOTAL_MS lets both cohorts be compared via one path.
    An existing plain histogram is never overwritten.
    """
    histograms = ping.get("payload", {}).get("histograms", {})
    has_e10s_name = "FX_TAB_SWITCH_TOTAL_E10S_MS" in histograms
    has_plain_name = "FX_TAB_SWITCH_TOTAL_MS" in histograms
    if has_e10s_name and not has_plain_name:
        histograms["FX_TAB_SWITCH_TOTAL_MS"] = histograms["FX_TAB_SWITCH_TOTAL_E10S_MS"]
    return ping
## Apply the histogram rename before running any comparisons.
subset = subset.map(fix_hist)
## Collect per-probe comparison results across cohorts.
results = []
## Garbage- and cycle-collector maximum pause times.
results.append(compare_histograms(subset, "payload/histograms/GC_MAX_PAUSE_MS"))
results.append(compare_histograms(subset, "payload/histograms/CYCLE_COLLECTOR_MAX_PAUSE"))
## Input event responsiveness.
ier = compare_histograms(subset, "payload/histograms/INPUT_EVENT_RESPONSE_MS")
results.append(ier)
# fpl = compare_histograms(subset, "payload/histograms/FX_PAGE_LOAD_MS")
# results.append(fpl)
## Memory usage probes.
mt = compare_histograms(subset, "payload/histograms/MEMORY_TOTAL")
results.append(mt)
mvmc = compare_histograms(subset, "payload/histograms/MEMORY_VSIZE_MAX_CONTIGUOUS")
results.append(mvmc)
mdac = compare_histograms(subset, "payload/histograms/MEMORY_DISTRIBUTION_AMONG_CONTENT")
results.append(mdac)
## Tab switching probes (FX_TAB_SWITCH_TOTAL_MS is aliased by fix_hist).
ftst = compare_histograms(subset, "payload/histograms/FX_TAB_SWITCH_TOTAL_MS")
results.append(ftst)
ftsu = compare_histograms(subset, "payload/histograms/FX_TAB_SWITCH_UPDATE_MS")
results.append(ftsu)
ftssv = compare_histograms(subset, "payload/histograms/FX_TAB_SWITCH_SPINNER_VISIBLE_MS")
results.append(ftssv)
ftssvl = compare_histograms(subset, "payload/histograms/FX_TAB_SWITCH_SPINNER_VISIBLE_LONG_MS")
results.append(ftssvl)
ftrnd = compare_histograms(subset, "payload/histograms/FX_TAB_REMOTE_NAVIGATION_DELAY_MS")
results.append(ftrnd)
## Slow-script probes are count histograms, so use the count comparison.
ssnc = compare_count_histograms(subset, "payload/histograms/SLOW_SCRIPT_NOTICE_COUNT")
results.append(ssnc)
sspc = compare_count_histograms(subset, "payload/histograms/SLOW_SCRIPT_PAGE_COUNT")
results.append(sspc)
ssnd = compare_count_histograms(subset, "payload/histograms/SLOW_SCRIPT_NOTIFY_DELAY")
results.append(ssnd)
Wrangling summary data together for trending week over week
def get_summary_data(results):
    """Flatten nested comparison results into one row per (os, probe, cohort).

    Each output row carries the cohort's sample size, the relevant p-value
    (webextensions vs. standard pair) and ten decile columns.
    """
    suffix_to_cohort = {
        '_1': 'webextensions-multiBucket1',
        '_4': 'webextensions-multiBucket4',
        '_1_std': 'multiBucket1',
        '_4_std': 'multiBucket4'
    }
    rows = []
    for result in results:
        for os_name, probes in result.items():
            for probe_name, stats in probes.items():
                for suffix, cohort in suffix_to_cohort.items():
                    # Standard cohorts carry the "_std" p-value variant.
                    pvalue_key = 'pvalue_14_std' if 'std' in suffix else 'pvalue_14'
                    row = {
                        'os': os_name,
                        'probe': probe_name,
                        'cohort': cohort,
                        'n': stats['n' + suffix],
                        'pvalue': stats[pvalue_key],
                    }
                    deciles = stats['dec' + suffix]
                    for i in range(10):
                        row['decile_{}'.format(i)] = deciles[i]
                    rows.append(row)
    return rows
## Flatten the results and order columns for the weekly summary CSV.
s = get_summary_data(results)
s = pd.DataFrame(s)
cols = ['probe', 'os', 'cohort', 'n', 'pvalue'] + ['decile_{}'.format(i) for i in range(10)]
s = s.sort_values(['probe', 'os', 'cohort'])[cols]
s.head()
s.to_csv("summary_multi_1.csv", index=False)
## Flag probe/os/cohort combinations with a significant difference (p <= .05).
concerns = s[s.pvalue <= .05][['probe', 'os', 'cohort', 'n', 'pvalue']]
if len(concerns) > 0:
    concerns.to_html('concerns.html', index=False)
def get_stacks(subset):
    """Aggregate parent-process hang stacks across all pings in *subset*.

    Returns a dict mapping a stack (tuple of frames) to the total number of
    hang samples longer than 100 ms observed for the parent "Gecko" thread.
    """
    min_hang_ms = 100

    def emit_stack_counts(ping):
        # Yield (stack, count) pairs for the parent Gecko thread only.
        for thread in ping["payload"]["threadHangStats"]:
            if thread["name"] != "Gecko":
                continue
            for hang in thread["hangs"]:
                stack = hang["stack"]
                if not stack:
                    continue
                values = hang["histogram"]["values"]
                counts = pd.Series(list(values.values()),
                                   index=[int(k) for k in values.keys()]).sort_index()
                yield (tuple(stack), counts[counts.index > min_hang_ms].sum())

    return subset.flatMap(emit_stack_counts).reduceByKey(lambda a, b: a + b).collectAsMap()
## Hang-stack aggregates, one per cohort.
wm1 = get_stacks(subset.filter(lambda x: x['cohort'] == WEBEXTENSION_MULTI_1))
wm4 = get_stacks(subset.filter(lambda x: x['cohort'] == WEBEXTENSION_MULTI_4))
m1 = get_stacks(subset.filter(lambda x: x['cohort'] == MULTI_1))
m4 = get_stacks(subset.filter(lambda x: x['cohort'] == MULTI_4))
def get_top_stacks_per_group(c1, c2, c3, c4, count,
                             names=['Addons/E10s', 'Addons/NoE10s', 'NoAddons/E10s', 'NoAddons/NoE10s']):
    '''
    Takes the top <count> stacks in c1 (by hang count) and reports each
    stack's share of every group's total hangs, labelled by the
    corresponding entry in <names>.

    Returns a pandas DataFrame of "NN.NN%" strings indexed by the stack
    (frames reversed and joined with <br>).
    Note: `names` is never mutated, so the shared default list is safe.
    '''
    groups = (c1, c2, c3, c4)
    totals = [sum(group.values()) for group in groups]
    rows = []
    index = []
    for stack, _ in Counter(c1).most_common(count):
        # Share of each group's total hangs for this stack, formatted in one
        # pass (the old code built floats, then a dead `row = {}` was
        # immediately overwritten, then a second applymap pass formatted).
        rows.append({name: '{:.2f}%'.format(100.0 * group.get(stack, 0) / total)
                     for name, group, total in zip(names, groups, totals)})
        index.append("<br>".join(reversed(stack)))
    if not rows:
        # Empty input previously raised inside pandas; return an empty table.
        return pd.DataFrame(columns=names)
    df = pd.DataFrame(rows)[names]
    df.index = index
    return df
def heat_map_df(df, color='red', css='background', null_color='white'):
    '''
    Colors in background of a percentage table
    based on value.
    returns stylized dataframe (different from pd.Dataframe)
    '''
    # `Color` comes from the third-party `colour` package (installed later in
    # the notebook via `pip install colour`) -- presumably provides
    # `range_to` gradients; confirm against that package's docs.
    color = Color(color)
    # Cells are strings like "12.34%"; strip the '%' to recover the number.
    # The largest value anywhere in the table sets the gradient span.
    dfmax = max(df.applymap(lambda x: float(x.split('%')[0])).max())
    n_colors = int(math.ceil(dfmax))
    # One gradient step per whole percentage point, reversed so that
    # null_color maps to 0 and `color` to the maximum value.
    gradient = list(reversed(list(color.range_to(Color(null_color),n_colors+1))))
    gradient_map = {}
    for i in range(n_colors+1):
        gradient_map[i] = gradient[i]
    # Every cell's ceiled percentage is <= n_colors by construction, so the
    # lookup below cannot miss.
    css_style = lambda x: "%s: %s" % \
        (css, gradient_map[int(math.ceil(float(x.split('%')[0])))])
    df1 = df.style.applymap(css_style)
    return df1
def heatmapify(df):
    '''
    Takes a pandas df and applies heatmap function to cells
    with some cleaner css
    returns raw html
    '''
    # Replacement CSS header; the Styler's own <style> block is swapped out
    # for this one below by splitting on the literal opening tag.
    css_header = '''
<style type="text/css" >
table {
font-family:Arial, Helvetica, sans-serif;
font-size:12px;
background:#eaebec;
margin:0 auto;
border:#ccc 1px solid;
-moz-border-radius:3px;
-webkit-border-radius:3px;
border-radius:3px;
}
table th {
padding:21px 25px 22px 25px;
border-top:1px solid #fafafa;
border-bottom:1px solid #e0e0e0;
}
table th:first-child {
text-align: left;
padding-left:20px;
}
table tr {
text-align: center;
padding-left:20px;
}
table td {
padding:18px;
border-top: 1px solid #ffffff;
border-bottom:1px solid #e0e0e0;
border-left: 1px solid #e0e0e0;
}
tr:hover{
background-color: #CBCDCF;
}
'''
    # NOTE(review): Styler.render() is deprecated/removed in newer pandas
    # (use Styler.to_html()) -- confirm the pandas version this notebook runs on.
    formatted_table = css_header + '\n' + heat_map_df(df).render()\
        .split('<style type="text/css" >')[1]
    return formatted_table
## Top-25 stack tables, one per cohort; the first argument's cohort
## determines which stacks are selected.
wm1s = get_top_stacks_per_group(wm1, wm4, m1, m4, 25,
    names = [WEBEXTENSION_MULTI_1, WEBEXTENSION_MULTI_4, MULTI_1, MULTI_4])
wm4s = get_top_stacks_per_group(wm4, wm1, m1, m4,25,
    names = [WEBEXTENSION_MULTI_4, WEBEXTENSION_MULTI_1, MULTI_1, MULTI_4])
m1s = get_top_stacks_per_group(m1, m4, wm1, wm4, 25,
    names = [MULTI_1, MULTI_4, WEBEXTENSION_MULTI_1, WEBEXTENSION_MULTI_4])
m4s = get_top_stacks_per_group(m4, m1, wm1, wm4,25,
    names = [MULTI_4, MULTI_1, WEBEXTENSION_MULTI_1, WEBEXTENSION_MULTI_4])
%%bash
pip install colour
import os
## `colour` was just installed via pip in the bash cell above.
from colour import Color
## Ensure the output directory for the rendered HTML tables exists.
if not os.path.exists('./html'):
    os.makedirs('html')
## Render each cohort's top-stack table to HTML and write it out.
cohorts = (wm1s, wm4s, m1s, m4s)
filenames = ('wm1.html', 'wm4.html', 'm1.html', 'm4.html')
## Pair each table with its filename directly instead of indexing range(4).
for table, filename in zip(cohorts, filenames):
    html = heatmapify(table)
    with open('./html/' + filename, 'w') as f:
        f.write(html)
%%bash
ls
%%bash
jupyter nbconvert --to html e10sMulti_experiment-Copy1.ipynb --output-dir html/
%%bash
aws s3 cp --recursive ./html s3://telemetry-test-bucket/e10s_experiment_view/multi_webExtensions_beta54_cohorts/week1-html